# Archive/shuffled_ttest v1.05.r

#Version 1.03
#initial program for running the shuffling of t-tests
#Data should be organized with Var 1 in column 1 and Var 2 in column 2

#Runs an independent-samples t-test a number of times, with the data segmented by the minimum number of participants needed to reach significance

#data_set should refer to a dataset available in the global environment
#shuffle_amount denotes how many times the data will be shuffled
#alpha is the significance threshold; it defaults to .05 when omitted
#base n represents the minimum number of participants to select for replication
#csvFileName, if included, names the CSV file the results are written to ('.csv' is appended) - Must be in " "

#Timing estimates
#1,000 shufflings with 30 participants took approximately 20 seconds, and resulted in ~6,000 t.tests
#10,000 shufflings with 30 participants took approximately 8 and half minutes, and resulted in 60,000 t.tests

#-------------------------------
#Required packages
packages = c("tictoc")

#use this function to check if each package is on the local machine
#if a package is installed, it will be loaded
#if any are not, the missing package(s) will be installed and loaded
package.check <- lapply(packages, FUN = function(x) {
    if (!require(x, character.only = TRUE)) {
        install.packages(x, dependencies = TRUE)
        library(x, character.only = TRUE)
    }
})

######Base n--------------------------------------------------

# Finds the "base n": the smallest number of leading rows (participants) at
# which a t-test of column 1 against column 2 is significant.
#
# The t-test is run on rows 1:2, 1:3, ..., 1:nrow(data_set), and the first
# prefix length k with k >= min_n and p <= alpha is returned.
#
# Arguments:
#   data_set  Data with group 1 in column 1 and group 2 in column 2.
#   alpha     Significance threshold the p value is compared against.
#   min_n     Smallest prefix length that may be returned (default 10).
#
# Returns the prefix length as a number, or 0 when no prefix qualifies.
#
# Side effect: on success the vector of cumulative p values is assigned to the
# global environment.
find_base_n <- function(data_set, alpha, min_n = 10) {
  data_set <- as.data.frame(data_set, stringsAsFactors = FALSE)
  if (ncol(data_set) < 2) {
    stop("data_set must contain at least two columns", call. = FALSE)
  }

  # determine max participants
  maxp <- nrow(data_set)
  col1 <- data_set[[1]]
  col2 <- data_set[[2]]

  # NA means "no t-test could be run for this prefix". A zero-filled vector
  # would make every skipped prefix look significant (p = 0 <= alpha).
  pvalues <- rep(NA_real_, maxp)

  # t-test on incrementally increasing participants, i.e. 1-2, 1-3, 1-4, ...
  # seq_len(maxp)[-1] is empty when there are fewer than two rows.
  for (k in seq_len(maxp)[-1]) {
    # t.test() drops missing values per group, so judge testability on the
    # same observations it will actually use.
    group1 <- col1[seq_len(k)]
    group2 <- col2[seq_len(k)]
    group1 <- group1[!is.na(group1)]
    group2 <- group2[!is.na(group2)]

    # Needs two observations per group and some spread in at least one of
    # them. Note var(x, y) would be the covariance, which can be exactly zero
    # for perfectly testable data, so each group is checked separately.
    testable <- length(group1) >= 2 && length(group2) >= 2 &&
      (var(group1) > 0 || var(group2) > 0)

    if (testable) {
      pvalues[k] <- t.test(group1, group2)$p.value
    }
  }

  # first prefix that is both long enough and significant
  significant <- which(
    !is.na(pvalues) & pvalues <= alpha & seq_along(pvalues) >= min_n
  )

  # if no significant p values are found, return 0
  if (length(significant) == 0) {
    return(0)
  }

  # NOTE(review): the global name 'pavlues' looks like a misspelling of
  # 'pvalues'; it is kept as-is so existing workflows that read it still work.
  assign("pavlues", pvalues, envir = .GlobalEnv)
  as.numeric(significant[1])
}
######End Base n----------------------------------------------


# Shuffles the rows of data_set shuffle_amount times and, for every shuffle,
# replicates an independent-samples t-test across consecutive blocks of rows.
#
# For each shuffle, find_base_n() supplies the block size (base n). t.test()
# is then run on rows 1:base_n, (base_n + 1):(2 * base_n), and so on; trailing
# rows that do not fill a complete block are not tested.
#
# Arguments:
#   data_set        Data with group 1 in column 1 and group 2 in column 2.
#   shuffle_amount  Number of times the rows are shuffled.
#   alpha           Significance threshold; defaults to .05.
#   csvFileName     Optional output file name without extension (".csv" is
#                   appended). When omitted, the results are written to
#                   "results.csv" and also assigned to `results` in the
#                   global environment.
#
# Returns a one-line summary string with the number of significant tests out
# of all tests run. Stops with an error as soon as one shuffle yields no
# significant base n. The elapsed run time is reported through tic()/toc().
shuffled_ttest <- function(data_set, shuffle_amount, alpha = 0.05,
                           csvFileName) {
  # TODO Option to get input from file

  ###### Variable setup -----------------------------------

  # stringsAsFactors is given explicitly: default.stringsAsFactors() is
  # deprecated since R 4.1 (defunct in newer releases), and find_base_n()
  # already converts with FALSE.
  data_set <- as.data.frame(data_set, stringsAsFactors = FALSE)
  if (ncol(data_set) < 2) {
    stop("data_set must contain at least two columns", call. = FALSE)
  }
  n_rows <- nrow(data_set)

  tic("Run time") # start timer

  # Without a file name the results go to 'results.csv' and to the global
  # environment; with one they go only to '<csvFileName>.csv'.
  save_to_global <- missing(csvFileName)
  out_file <- if (save_to_global) {
    "results.csv"
  } else {
    paste0(csvFileName, ".csv")
  }

  group1_col <- 1 # group 1 column = variable 1, change as needed
  group2_col <- 2 # group 2 column = variable 2, change as needed

  # Template fixing the 7 output columns; also the result when nothing is run.
  empty_results <- data.frame(
    iteration = numeric(0),
    sample = numeric(0),
    range = character(0),
    base.n = numeric(0),
    t = double(0),
    df = double(0),
    p.value = double(0),
    stringsAsFactors = FALSE
  )

  # One data frame per shuffle, bound together once at the end. Appending a
  # row to `results` inside the loop copies the whole table every time.
  chunks <- vector("list", shuffle_amount)

  # used to keep track of number of significant findings
  sum_sig_p <- 0

  # Warning for large shuffling amounts
  if (shuffle_amount > 100) {
    print("Please wait...")
  }

  ###### Shuffling and replication ------------------------------
  # seq_len() runs zero iterations for shuffle_amount = 0 (1:0 would run two).
  for (i in seq_len(shuffle_amount)) {
    # shuffles data set using 'sample()'
    data_set <- data_set[sample(seq_len(n_rows)), ]

    # finds base n for each iteration
    base_n <- find_base_n(data_set, alpha)

    # Error check to make sure there are significant findings
    if (base_n == 0) {
      stop("No significant p values found!")
    }

    # Only complete blocks of base_n rows are tested, so there may be unused
    # rows at the end of the data set.
    # TODO - Add option for include/exclude uneven N
    n_blocks <- n_rows %/% base_n
    block_range <- character(n_blocks)
    block_t <- numeric(n_blocks)
    block_df <- numeric(n_blocks)
    block_p <- numeric(n_blocks)

    for (cycle in seq_len(n_blocks)) {
      # current selection of participants: rows x through y
      y <- cycle * base_n
      x <- y - base_n + 1

      ttestresults <- t.test(
        data_set[x:y, group1_col],
        data_set[x:y, group2_col]
      )

      # if the test is significant, increase count by 1
      if (ttestresults$p.value < alpha) {
        sum_sig_p <- sum_sig_p + 1
      }

      # statistical output, rounded for reporting
      block_range[cycle] <- paste0(x, ":", y)
      block_t[cycle] <- round(ttestresults$statistic, 3)
      block_df[cycle] <- round(ttestresults$parameter, 4)
      block_p[cycle] <- round(ttestresults$p.value, 5)
    }

    # Organized as [iteration, cycle number, range, base n, t statistic,
    # degrees of freedom, p value].
    chunks[[i]] <- data.frame(
      iteration = rep(as.numeric(i), n_blocks),
      sample = as.numeric(seq_len(n_blocks)),
      range = block_range,
      base.n = rep(base_n, n_blocks),
      t = block_t,
      df = block_df,
      p.value = block_p,
      stringsAsFactors = FALSE
    )
  }

  # The template comes first so a run with no shuffles still yields a
  # zero-row table with the right columns.
  results <- do.call(rbind, c(list(empty_results), chunks))
  ####### End shuffling and replication ---------------------

  ####### Export --------------------------------------------
  if (save_to_global) {
    assign("results", results, envir = .GlobalEnv)
  }
  write.csv(results, file = out_file, row.names = TRUE)
  ####### End Export ------------------------------------------

  # shows the results in console if there are less than 50 rows
  if (nrow(results) < 50) {
    show(results)
  }

  toc()

  # show relative number of successful replications
  paste0(
    "Significant findings (p < ", alpha, "): ",
    sum_sig_p, "/", nrow(results)
  )
}
# baileymh/Shuffle documentation built on Sept. 4, 2019, 8:43 a.m.